{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install malaya-gpu -U --no-deps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2037249"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('filtered-dumping-wiki.txt') as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "data = [i for i in data if len(i) >= 2]\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Dirk Jan Klaas \"Klaas-Jan\" Huntelaar (lahir 12 Ogos 1983) merupakan pemain bola sepak Belanda yang bermain di posisi penyerang.'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Load quantized model will cause accuracy drop.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:74: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:74: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:76: The name tf.GraphDef is deprecated. Please use tf.compat.v1.GraphDef instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:76: The name tf.GraphDef is deprecated. Please use tf.compat.v1.GraphDef instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:51: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:51: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:66: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/malaya/function/__init__.py:66: The name tf.InteractiveSession is deprecated. Please use tf.compat.v1.InteractiveSession instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "\n",
    "model = malaya.dependency.transformer(model = 'xlnet', quantized = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = malaya.preprocessing.TOKENIZER().tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reset_t(tagging):\n",
    "    t = tagging.copy()\n",
    "    for i in range(len(t)):\n",
    "        t[i] = [t[i][0], 2]\n",
    "    return t\n",
    "\n",
    "def augment_3(t, graph, selected = ['compound', 'flat']):\n",
    "    l = list(graph.nodes.items())\n",
    "    for no, n in enumerate(l[1:]):\n",
    "        n = n[1]\n",
    "        if n['rel'] in selected and n['address'] - 1 == n['head']:\n",
    "            c = t[n['head'] - 1].copy()\n",
    "            c[1] = 3\n",
    "            t[n['head'] - 1] = [t[n['address'] - 1][0], 3]\n",
    "            t[n['address'] - 1] = c\n",
    "        \n",
    "    return t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "d_object, tagging, indexing = model.predict(' '.join(tokenizer(data[0])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph = malaya.dependency.dependency_graph(tagging, indexing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = reset_t(tagging)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Dirk', 2],\n",
       " ['Jan', 2],\n",
       " ['Klaas', 2],\n",
       " ['\"', 2],\n",
       " ['Klaas-Jan', 2],\n",
       " ['\"', 2],\n",
       " ['Huntelaar', 2],\n",
       " ['(', 2],\n",
       " ['lahir', 2],\n",
       " ['12', 2],\n",
       " ['Ogos', 2],\n",
       " ['1983', 2],\n",
       " [')', 2],\n",
       " ['merupakan', 2],\n",
       " ['bola', 3],\n",
       " ['sepak', 3],\n",
       " ['Belanda', 3],\n",
       " ['pemain', 3],\n",
       " ['yang', 2],\n",
       " ['bermain', 2],\n",
       " ['di', 2],\n",
       " ['posisi', 2],\n",
       " ['penyerang', 2],\n",
       " ['.', 2]]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "augment_3(t, graph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████▉| 4992/5000 [07:39<00:00, 10.21it/s]"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "results = []\n",
    "for i in tqdm(range(len(data[:5000]))):\n",
    "    try:\n",
    "        d_object, tagging, indexing = model.predict(' '.join(tokenizer(data[i])))\n",
    "        graph = malaya.dependency.dependency_graph(tagging, indexing)\n",
    "        t = reset_t(tagging)\n",
    "        results.append((t.copy(), augment_3(t, graph)))\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('dataset-tatabahasa.pkl', 'wb') as fopen:\n",
    "    pickle.dump(results, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
